library ("readxl")
library ("ggplot2") # version 3.4.4
library ("gridExtra") # version 2.3
library ("dplyr") # version 1.1.2
library ("paletteer") # version 1.4.0
library ("tibble") # version 3.2.1
library ("tidyr") # version 1.3.0
library ("ALFAM2") # version 3.7
library ("randomForest") # version 4.7.1.1
library ("xgboost") # version 1.7.6.1
library ("glmnet") # version 4.1.8
library ("treeshap") # version 0.3.1
library ("shapviz") # version 0.9.3
library ("kableExtra") # version 1.3.4Machine Learning Approaches for Ammonia Volatilization Prediction after Manure Application
axis_text_size = 22
axis_title_size = 26
title_size = 26
theme_replace(
axis.text = element_text(size = axis_text_size),
axis.title.x = element_text(size = axis_title_size, angle = 0, margin = ggplot2::margin (t = 10)),
axis.title.y = element_text(size = axis_title_size, angle = 90, margin = ggplot2::margin (r = 10)),
plot.title = element_text(size = title_size, hjust = 0.5, face = "bold"),
legend.text = element_text(size = axis_title_size),
legend.title = element_text(size = axis_title_size, face = "bold"),
strip.text = element_text(size = axis_title_size, face = "bold", color = "white"),
strip.background = element_rect(fill = "#000080"),
panel.background = element_rect(fill = "#f3faff"), panel.grid.major = element_line(colour = "white"),
panel.spacing = unit(2, "lines")
)
options(ggplot2.discrete.fill = list("#67c5a7"))
Dark2 = paletteer_d("RColorBrewer::Dark2")Data description
load (file = "scripts/processed_data/data_alfam2.Rdata")data_alfam2 %>% summarise (n = n(), .by = dataset) dataset n
1 Calibration subset 5515
2 Evaluation subset 424
plot_data_description = data_alfam2 %>%
filter (! (app.mthd == "bsth" & incorp == "shallow")) %>%
mutate (strategy = paste (app.mthd, incorp, sep = " - ")) %>%
mutate (strategy = recode (strategy, "bc - none" = "A", "bc - shallow" = "B", "bc - deep" = "C", "bsth - none" = "D", "os - none" = "E", "ts - none" = "F")) %>%
ggplot () +
geom_line (aes (x = time, y = e.rel, group = pmid), linewidth = 0.15) +
ylab ("Cumul of relative emission") +
xlab ("Time (h)") +
theme (axis.title.y = element_text (margin = ggplot2::margin (r = 25)),
strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +
# scale_color_manual (values = Dark2) +
facet_wrap (~ strategy)
plot_data_descriptiondata_alfam2 %>%
select (air.temp, wind.2m, rain.rate, rain.cum) %>%
summary air.temp wind.2m rain.rate rain.cum
Min. :-1.90 Min. : 0.05133 Min. :0.0000 Min. : 0.000
1st Qu.: 9.50 1st Qu.: 1.65665 1st Qu.:0.0000 1st Qu.: 0.000
Median :12.50 Median : 2.80000 Median :0.0000 Median : 0.000
Mean :13.28 Mean : 3.08744 Mean :0.0406 Mean : 1.131
3rd Qu.:16.85 3rd Qu.: 4.08000 3rd Qu.:0.0000 3rd Qu.: 0.400
Max. :35.24 Max. :16.78400 Max. :7.1000 Max. :55.900
data_alfam2 %>%
filter (time == max (time), .by = pmid) %>%
select (e.cum, time, tan.app, app.rate, man.dm, man.ph, t.incorp) %>%
summary e.cum time tan.app app.rate
Min. : 0.264 Min. :20.75 Min. : 10.90 Min. : 6.60
1st Qu.: 5.815 1st Qu.:48.16 1st Qu.: 36.66 1st Qu.: 18.73
Median : 10.968 Median :70.30 Median : 54.28 Median : 28.40
Mean : 16.162 Mean :62.13 Mean : 62.07 Mean : 29.76
3rd Qu.: 21.473 3rd Qu.:71.70 3rd Qu.: 80.30 3rd Qu.: 36.65
Max. :140.570 Max. :78.00 Max. :235.40 Max. :132.60
man.dm man.ph t.incorp
Min. : 1.000 Min. :6.40 Min. : 0.0000
1st Qu.: 3.790 1st Qu.:7.10 1st Qu.: 0.0000
Median : 6.755 Median :7.40 Median : 0.0833
Mean : 6.248 Mean :7.42 Mean : 1.1878
3rd Qu.: 8.150 3rd Qu.:7.70 3rd Qu.: 0.0833
Max. :13.600 Max. :8.50 Max. :48.0000
NA's :85 NA's :475
Part 1 - Model comparison
Alfam2
# We use alfam2pars01 parameter without pH, like in Hafner et al, 2019
pars <- alfam2pars01[!grepl('man.ph', names(alfam2pars01))]
alfam2_predictions = alfam2 (
pars = pars,
dat = data_alfam2 %>% select (- j.NH3, - e.cum, - e.rel, - dataset, - country),
app.name = "tan.app",
time.name = "time",
time.incorp = "t.incorp",
group = "pmid",
prep = TRUE,
warn = FALSE
)We add the true values to the prediction dataframe and we keep only the last observation of each pmid.
alfam2_predictions = alfam2_predictions %>%
select (pmid, time, e, er) %>%
mutate (truth_e = data_alfam2$e.cum) %>%
mutate (truth_er = data_alfam2$e.rel) %>%
mutate (dataset = data_alfam2$dataset,
country = data_alfam2$country,
app.mthd = data_alfam2$app.mthd) %>%
select (pmid, time, e, er, truth_e, truth_er, dataset, country, app.mthd) %>%
filter (time == max (time), .by = pmid)alfam2_predictions %>%
mutate (app.mthd = recode (app.mthd, "bc" = "Broadcast", "bsth" = "Trailing hose", "ts" = "Trailing shoe", "os" = "Open slot injection")) %>%
mutate (app.mthd = factor (app.mthd, levels = c ("Broadcast", "Trailing hose", "Trailing shoe", "Open slot injection"))) %>%
mutate (residuals = truth_er - er) %>%
pivot_longer (cols = c ("er", "truth_er", "residuals")) %>%
mutate (name = recode (name, "er" = "prediction", "truth_er" = "truth")) %>%
mutate (name = factor (name, levels = c ("truth", "prediction", "residuals"))) %>%
ggplot () +
geom_boxplot (aes (x = country, y = value, fill = name)) +
geom_hline (yintercept = 0, linetype = 2) +
facet_wrap (~ app.mthd, ncol = 1) +
ylab ("Relative 72h cum. emission or residual") +
labs (fill = "") +
theme (legend.position = "bottom",
axis.title.y = element_text (margin = ggplot2::margin (r = 30)),
strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0)))Random forest
load (file = "scripts/processed_data/data_random_forest.Rdata")data_random_forest_calibration = data_random_forest %>%
filter (dataset == "Calibration subset") %>%
select (- pmid, - e.rel, - dataset, - country)
set.seed (123)
random_forest_model = randomForest (
e.cum ~ .,
data = data_random_forest_calibration,
importance = TRUE,
mtry = 19, nodesize = 3, replace = FALSE, sample_frac = 0.8
)plot (random_forest_model)random_forest_predictions = data_random_forest %>%
mutate (e.cum_hat = predict (
random_forest_model,
newdata = data_random_forest %>%
select (- pmid, - e.cum, - e.rel, - dataset, - country)
)
) %>%
mutate (e.rel_hat = e.cum_hat / tan.app) %>%
mutate (app.mthd = recode (as.character (app.mthd), "1" = "bc", "2" = "ts", "3" = "os", "4" = "bsth")) %>%
select (pmid, time, e = e.cum_hat, er = e.rel_hat, truth_e = e.cum, truth_er = e.rel, dataset, country, app.mthd) Xgboost
load (file = "scripts/processed_data/data_xgboost.Rdata")set.seed (123)
xgboost_model = xgboost (
data = xgb.DMatrix (
data = data_xgboost %>%
filter (dataset == "Calibration subset") %>%
select (- pmid, - e.cum, - e.rel, - dataset, - country) %>%
as.matrix %>%
{.},
label = data_xgboost %>%
filter (dataset == "Calibration subset") %>%
select (e.cum) %>%
as.matrix %>%
{.}
),
max.depth = 6, nrounds = 300, eta = 0.3, min_child_weight = 0.5, subsample = 0.8,
verbose = FALSE,
objective = "reg:squarederror"
)xgboost_predictions = data_xgboost %>%
mutate (e.cum_hat = predict (
xgboost_model,
newdata = data_xgboost %>%
select (- pmid, - e.cum, - e.rel, - dataset, - country) %>%
as.matrix
)
) %>%
mutate (e.rel_hat = e.cum_hat / tan.app) %>%
mutate (app.mthd = recode (as.character (app.mthd), "1" = "bc", "2" = "ts", "3" = "os", "4" = "bsth")) %>%
select (pmid, time, e = e.cum_hat, er = e.rel_hat, truth_e = e.cum, truth_er = e.rel, dataset, country, app.mthd)Lasso
load (file = "scripts/processed_data/data_lasso.Rdata")set.seed (123)
lasso_model = cv.glmnet (
x = data_lasso %>%
filter (dataset == "Calibration subset") %>%
select (- pmid, - e.cum, - e.rel, - dataset, - country) %>%
as.matrix %>%
{.},
y = data_lasso %>%
filter (dataset == "Calibration subset") %>%
select (e.cum) %>%
as.matrix %>%
{.},
alpha = 1
)lasso_predictions_vector = predict (
lasso_model,
data_lasso %>%
select (- pmid, - e.cum, - e.rel, - dataset, - country) %>%
as.matrix
)
df_join = data_alfam2 %>% select (pmid, app.mthd) %>% distinct
lasso_predictions = data_lasso %>%
left_join (df_join, by = "pmid") %>%
mutate (e.cum_hat = exp (lasso_predictions_vector), e.cum = exp (e.cum)) %>%
mutate (e.rel_hat = e.cum_hat / tan.app) %>%
select (pmid, time, e = e.cum_hat, er = e.rel_hat, truth_e = e.cum, truth_er = e.rel, dataset, country, app.mthd) Comparison
predictions_of_all_methods = rbind (
alfam2_predictions %>% mutate (method = "alfam2"),
random_forest_predictions %>% mutate (method = "random forest"),
xgboost_predictions %>% mutate (method = "xgboost"),
lasso_predictions %>% mutate (method = "lasso")
)predictions_of_all_methods %>% head pmid time e er truth_e truth_er dataset country
1 182 44.75 21.077637 0.1726119 11.8310 0.096885 Calibration subset DK
2 183 45.20 7.305541 0.1252665 5.5152 0.094567 Calibration subset DK
3 184 45.15 14.693149 0.1867362 11.2350 0.142780 Calibration subset DK
4 185 45.50 7.945285 0.1262760 7.2675 0.115500 Calibration subset DK
5 186 43.85 14.753793 0.2585346 9.1200 0.159810 Calibration subset DK
6 187 45.25 8.523611 0.1827650 6.5170 0.139740 Calibration subset DK
app.mthd method
1 bc alfam2
2 bsth alfam2
3 bc alfam2
4 bsth alfam2
5 bc alfam2
6 bsth alfam2
Residuals
plot_residuals_complete_dataset = predictions_of_all_methods %>%
mutate (residuals = truth_er - er) %>%
mutate (app.mthd = recode (app.mthd, "bc" = "Broadcast", "bsth" = "Trailing hose", "ts" = "Trailing shoe", "os" = "Open slot injection")) %>%
mutate (app.mthd = factor (app.mthd, levels = c ("Broadcast", "Trailing hose", "Trailing shoe", "Open slot injection"))) %>%
ggplot () +
geom_boxplot (aes (x = country, y = residuals, fill = method)) +
geom_hline (yintercept = 0, linetype = 2) +
scale_fill_manual (values = Dark2[c(2, 5, 6, 8)]) +
facet_wrap (~ app.mthd, ncol = 1) +
ylab ("Residual") +
labs (fill = "") +
theme (legend.position = "bottom",
axis.title.y = element_text (margin = ggplot2::margin (r = 30)),
strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0)))
plot_residuals_complete_datasetplot_residuals_evaluation_subset = predictions_of_all_methods %>%
filter (dataset == "Evaluation subset") %>%
mutate (residuals = truth_er - er) %>%
mutate (app.mthd = recode (app.mthd, "bc" = "Broadcast", "bsth" = "Trailing hose", "ts" = "Trailing shoe", "os" = "Open slot injection")) %>%
mutate (app.mthd = factor (app.mthd, levels = c ("Broadcast", "Trailing hose", "Trailing shoe", "Open slot injection"))) %>%
ggplot () +
geom_boxplot (aes (x = country, y = residuals, fill = method), varwidth = TRUE) +
geom_hline (yintercept = 0, linetype = 2) +
scale_fill_manual (values = Dark2[c(2, 5, 6, 8)]) +
facet_wrap (~ app.mthd, ncol = 1) +
ylab ("Residual") +
labs (fill = "") +
theme (legend.position = "bottom",
axis.title.y = element_text (margin = ggplot2::margin (r = 30)),
strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0)))
plot_residuals_evaluation_subsetObserved vs predicted values
plot_observed_vs_predicted_values = predictions_of_all_methods %>%
filter (dataset == "Evaluation subset") %>%
mutate (method = recode (method, "alfam2" = "A", "lasso" = "B", "random forest" = "C", "xgboost" = "D")) %>%
ggplot () +
geom_point (aes (x = truth_e, y = e, color = method), size = 3) +
geom_abline (slope = 1, linetype = "dashed") +
facet_wrap (~ method) +
scale_color_manual (values = Dark2[c(2, 5, 6, 8)]) +
labs (color = "") +
theme (legend.position = "none",
axis.title.y = element_text (margin = ggplot2::margin (r = 30)),
axis.title.x = element_text (margin = ggplot2::margin (t = 15, b = 20)),
strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +
xlab ("Observed values") + ylab ("Predicted values") +
NULL
plot_observed_vs_predicted_valuesEvaluation metrics
df_evaluation_metrics = rbind (
predictions_of_all_methods %>%
select (prediction = e, truth = truth_e, dataset, method) %>%
mutate (response = "72h cum. emission"),
predictions_of_all_methods %>%
select (prediction = er, truth = truth_er, dataset, method) %>%
mutate (response = "72h relative cum. emission")
) %>%
summarise (
MSE = mean ((prediction - truth) ^ 2),
RMSE = sqrt (mean ((prediction - truth) ^ 2)),
Pearsons_r = cor (prediction, truth),
ME = 1 - (sum ( (prediction - truth) ^ 2) / sum ( (truth - mean (truth)) ^ 2)),
MAE = mean (abs (prediction - truth)),
MBE = mean (prediction - truth),
.by = c (dataset, method, response)
) %>%
mutate_if (is.numeric, round, digits = 3)
df_evaluation_metrics %>% arrange (response, dataset) %>% kable ()| dataset | method | response | MSE | RMSE | Pearsons_r | ME | MAE | MBE |
|---|---|---|---|---|---|---|---|---|
| Calibration subset | alfam2 | 72h cum. emission | 113.864 | 10.671 | 0.823 | 0.610 | 6.644 | -2.983 |
| Calibration subset | random forest | 72h cum. emission | 12.360 | 3.516 | 0.982 | 0.958 | 2.214 | 0.142 |
| Calibration subset | xgboost | 72h cum. emission | 0.535 | 0.731 | 0.999 | 0.998 | 0.260 | -0.003 |
| Calibration subset | lasso | 72h cum. emission | 122.019 | 11.046 | 0.781 | 0.582 | 6.703 | -2.561 |
| Evaluation subset | alfam2 | 72h cum. emission | 65.292 | 8.080 | 0.860 | 0.618 | 5.647 | -2.808 |
| Evaluation subset | random forest | 72h cum. emission | 20.339 | 4.510 | 0.942 | 0.881 | 3.279 | 0.924 |
| Evaluation subset | xgboost | 72h cum. emission | 38.335 | 6.192 | 0.895 | 0.776 | 4.094 | 0.507 |
| Evaluation subset | lasso | 72h cum. emission | 53.318 | 7.302 | 0.839 | 0.688 | 5.574 | -1.381 |
| Calibration subset | alfam2 | 72h relative cum. emission | 0.031 | 0.175 | 0.801 | 0.528 | 0.118 | -0.056 |
| Calibration subset | random forest | 72h relative cum. emission | 0.003 | 0.054 | 0.979 | 0.956 | 0.038 | 0.006 |
| Calibration subset | xgboost | 72h relative cum. emission | 0.000 | 0.010 | 0.999 | 0.999 | 0.004 | 0.000 |
| Calibration subset | lasso | 72h relative cum. emission | 0.027 | 0.164 | 0.787 | 0.588 | 0.114 | -0.043 |
| Evaluation subset | alfam2 | 72h relative cum. emission | 0.023 | 0.151 | 0.793 | 0.581 | 0.114 | -0.044 |
| Evaluation subset | random forest | 72h relative cum. emission | 0.011 | 0.103 | 0.915 | 0.806 | 0.076 | 0.033 |
| Evaluation subset | xgboost | 72h relative cum. emission | 0.014 | 0.116 | 0.886 | 0.752 | 0.084 | 0.008 |
| Evaluation subset | lasso | 72h relative cum. emission | 0.025 | 0.157 | 0.770 | 0.549 | 0.122 | 0.000 |
df_evaluation_metrics %>%
filter (response == "72h cum. emission") %>%
select (- MSE, - RMSE, - Pearsons_r, - ME) %>%
pivot_longer (cols = c (MAE, MBE)) %>%
arrange (name, dataset) %>%
mutate (variation = (1 - abs(value) / abs(value [method == "alfam2"])) * 100, .by = c(name, dataset)) %>%
kable ()| dataset | method | response | name | value | variation |
|---|---|---|---|---|---|
| Calibration subset | alfam2 | 72h cum. emission | MAE | 6.644 | 0.0000000 |
| Calibration subset | random forest | 72h cum. emission | MAE | 2.214 | 66.6767008 |
| Calibration subset | xgboost | 72h cum. emission | MAE | 0.260 | 96.0866948 |
| Calibration subset | lasso | 72h cum. emission | MAE | 6.703 | -0.8880193 |
| Evaluation subset | alfam2 | 72h cum. emission | MAE | 5.647 | 0.0000000 |
| Evaluation subset | random forest | 72h cum. emission | MAE | 3.279 | 41.9337701 |
| Evaluation subset | xgboost | 72h cum. emission | MAE | 4.094 | 27.5013281 |
| Evaluation subset | lasso | 72h cum. emission | MAE | 5.574 | 1.2927218 |
| Calibration subset | alfam2 | 72h cum. emission | MBE | -2.983 | 0.0000000 |
| Calibration subset | random forest | 72h cum. emission | MBE | 0.142 | 95.2396916 |
| Calibration subset | xgboost | 72h cum. emission | MBE | -0.003 | 99.8994301 |
| Calibration subset | lasso | 72h cum. emission | MBE | -2.561 | 14.1468320 |
| Evaluation subset | alfam2 | 72h cum. emission | MBE | -2.808 | 0.0000000 |
| Evaluation subset | random forest | 72h cum. emission | MBE | 0.924 | 67.0940171 |
| Evaluation subset | xgboost | 72h cum. emission | MBE | 0.507 | 81.9444444 |
| Evaluation subset | lasso | 72h cum. emission | MBE | -1.381 | 50.8190883 |
# Evaluation on evalutation subset
df_plot = df_evaluation_metrics %>%
filter (response == "72h cum. emission") %>%
select (- MSE, - RMSE) %>%
filter (dataset == "Evaluation subset") %>%
pivot_longer (cols = c (Pearsons_r, ME, MAE, MBE)) %>%
mutate (name = factor (name, levels = c("Pearsons_r", "ME", "MAE", "MBE")))
suppressWarnings(ggplot (df_plot) +
geom_histogram (aes (x = name, y = value, fill = method), position = "dodge", stat = "identity") +
facet_wrap (~ name, scales = "free", nrow = 1) +
xlab ("") + ylab ("") + labs (fill = "") +
theme (legend.position = "none ",
strip.text.y = element_text(angle = 0),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +
scale_fill_manual (values = Dark2[c(2, 5, 6, 8)]) +
ggtitle ("Evaluation subset") + theme (plot.title = element_text (margin= ggplot2::margin(0,0,20,0), hjust = 0, size = 30)))# Evaluation on calibration subset
df_plot = df_evaluation_metrics %>%
filter (response == "72h cum. emission") %>%
select (- MSE, - RMSE) %>%
filter (dataset == "Calibration subset") %>%
pivot_longer (cols = c (Pearsons_r, ME, MAE, MBE)) %>%
mutate (name = factor (name, levels = c("Pearsons_r", "ME", "MAE", "MBE")))
suppressWarnings(ggplot (df_plot) +
geom_histogram (aes (x = name, y = value, fill = method), position = "dodge", stat = "identity") +
facet_wrap (~ name, scales = "free", nrow = 1) +
xlab ("") + ylab ("") + labs (fill = "") +
theme (legend.position = "bottom",
strip.text.y = element_text(angle = 0),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +
scale_fill_manual (values = Dark2[c(2, 5, 6, 8)]) +
ggtitle ("Calibration subset") + theme (plot.title = element_text (margin= ggplot2::margin(0,0,20,0), hjust = 0, size = 30)))Part 2 - New training with the whole dataset
Random forest
set.seed (123)
random_forest_model = randomForest (
e.cum ~ .,
data = data_random_forest %>% select (- pmid, - e.rel, - dataset, - country),
importance = TRUE,
mtry = 19, nodesize = 3, replace = FALSE, sample_frac = 0.8
)Shapley
df_tmp = data_random_forest_calibration %>% select (- e.cum)
unified_model_rf <- treeshap::randomForest.unify (random_forest_model, df_tmp)
treeshap_rf <- treeshap::treeshap (unified_model_rf, df_tmp, verbose = FALSE)
shapley_rf <- shapviz (treeshap_rf, X = data_random_forest_calibration %>% select (- e.cum))plot_shap_rf = shapviz::sv_importance (shapley_rf, kind = "beeswarm", max_display = 10L) +
scale_colour_gradient (low = "darkblue", high = "red", breaks = c(0, 1), labels = c("low", "high")) +
theme (
legend.title = element_text (size = 25, margin = ggplot2::margin (r = 50, b = 100)),
legend.position = "bottom",
legend.key.width = unit(3, "cm"),
axis.title.x = element_text (margin = ggplot2::margin( b = 30))
)
plot_shap_rfGain
df_importance_rf = randomForest::importance (random_forest_model) %>%
as.data.frame %>%
rownames_to_column (var = "Variable") %>%
mutate (`%IncMSE` = `%IncMSE` / max (`%IncMSE`)) %>%
mutate (type = "A")
plot_importance_rf = ggplot (df_importance_rf) +
geom_point (aes (x = `%IncMSE`, y = Variable), size = 5) +
scale_y_discrete (limits = df_importance_rf %>% arrange (`%IncMSE`) %>% pull (Variable)) +
theme (strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +
xlab ("Importance") +
ylab ("") +
facet_wrap (~ type)
plot_importance_rfdf_importance_rf %>%
arrange (desc (`%IncMSE`)) %>%
head (n = 10) %>% kable ()| Variable | %IncMSE | IncNodePurity | type |
|---|---|---|---|
| app.mthd | 1.0000000 | 18855.0746 | A |
| tan.app | 0.6135884 | 18809.3852 | A |
| man.ph | 0.5491638 | 13928.9417 | A |
| incorp | 0.4257513 | 6180.6429 | A |
| t.incorp | 0.3345969 | 4605.3732 | A |
| man.dm | 0.3251821 | 5602.9474 | A |
| app.rate | 0.2350342 | 4444.9184 | A |
| temp_6 | 0.2256594 | 1241.4233 | A |
| rain_6 | 0.2081427 | 1181.8105 | A |
| rain_1 | 0.1985322 | 371.3292 | A |
Xgboost
set.seed (123)
xgboost_model = xgboost (
data = xgb.DMatrix (
data = data_xgboost %>%
select (- pmid, - e.cum, - e.rel, - dataset, - country) %>%
as.matrix %>%
{.},
label = data_xgboost %>%
select (e.cum) %>%
as.matrix %>%
{.}
),
max.depth = 6, nrounds = 300, eta = 0.3, min_child_weight = 0.5, subsample = 0.8,
verbose = FALSE,
objective = "reg:squarederror"
)Shapley
df_tmp = data_xgboost %>% select (- pmid, - e.cum, - e.rel, - dataset, - country)
unified_model_xgboost <- treeshap::xgboost.unify (xgboost_model, as.matrix(df_tmp))
treeshap_xgboost <- treeshap::treeshap (unified_model_xgboost, df_tmp, verbose = FALSE)
shapley_xgb <- shapviz::shapviz (treeshap_xgboost, X = df_tmp)plot_shap_xgboost = shapviz::sv_importance (shapley_xgb, kind = "beeswarm", max_display = 10L) +
scale_colour_gradient (low = "darkblue", high = "red", breaks = c(0, 1), labels = c("low", "high")) +
theme (legend.title = element_text (size = 30, margin = ggplot2::margin (r = 50, b = 100)),
legend.position = "bottom",
legend.key.width = unit(3, "cm"),
axis.title.x = element_text (margin = ggplot2::margin( b = 30)))
plot_shap_xgboostGain
df_importance_xgb = xgboost::xgb.importance (model = xgboost_model) %>%
mutate (Gain = Gain / max (Gain)) %>%
mutate (type = "B")
plot_importance_xgboost = df_importance_xgb %>%
ggplot() +
geom_point (aes (x = Gain, y = Feature), size = 5) +
scale_y_discrete (limits = df_importance_xgb %>% arrange (Gain) %>% pull (Feature)) +
theme (strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +
xlab ("Importance") + ylab ("") +
facet_wrap (~ type)
plot_importance_xgboostdf_importance_xgb %>%
arrange (desc (Gain)) %>%
head (n = 10) %>% kable ()| Feature | Gain | Cover | Frequency | type |
|---|---|---|---|---|
| app.mthd | 1.0000000 | 0.0144249 | 0.0210013 | B |
| tan.app | 0.9913541 | 0.0607105 | 0.1204637 | B |
| man.ph | 0.7498047 | 0.1183736 | 0.0562836 | B |
| incorp | 0.4445915 | 0.0048873 | 0.0189852 | B |
| man.dm | 0.2828499 | 0.0617338 | 0.0601478 | B |
| app.rate | 0.1648063 | 0.0317940 | 0.0460349 | B |
| time | 0.1421372 | 0.1771188 | 0.2436156 | B |
| temp_1 | 0.1310175 | 0.0494286 | 0.0443548 | B |
| temp_2 | 0.0701466 | 0.0453213 | 0.0431788 | B |
| temp_4 | 0.0579904 | 0.0107690 | 0.0120968 | B |
Lasso
set.seed (123)
lasso_model = cv.glmnet (
x = data_lasso %>%
select (- pmid, - e.cum, - e.rel, - dataset, - country) %>%
as.matrix %>%
{.},
y = data_lasso %>%
select (e.cum) %>%
as.matrix %>%
{.},
alpha = 1
)Part 3 - Testing scenarios
load (file = "scripts/processed_data/data_scenarios.Rdata")
data_scenarios = data_scenarios %>%
mutate (t.incorp = as.factor (t.incorp)) %>%
mutate (t.incorp = recode (t.incorp, "1000" = "-"))Random forest
load (file = "scripts/processed_data/data_scenarios_random_forest.Rdata")
data_tmp = data_scenarios_random_forest
random_forest_predictions_scenarios_vector = predict (
random_forest_model,
newdata = rbind (data_random_forest_calibration[1, -1], data_tmp)[- 1, ]
)
data_scenarios = data_scenarios %>%
mutate (e.cum_hat_random_forest = random_forest_predictions_scenarios_vector, .before = time) %>%
mutate (efficacy_random_forest = ((e.cum_hat_random_forest / e.cum_hat_random_forest[app.mthd == "bc" & incorp == "none" & t.incorp == "-"]) - 1) * 100,
.by = c (tan.app, app.rate, man.dm, man.ph, man.source, group_temp, group_wind, group_rain)) %>%
{.}Xgboost
load (file = "scripts/processed_data/data_scenarios_xgboost.Rdata")
xgboost_predictions_scenarios_vector = predict (
xgboost_model,
data_scenarios_xgboost %>% as.matrix
)
data_scenarios = data_scenarios %>%
mutate (e.cum_hat_xgboost = xgboost_predictions_scenarios_vector, .before = time) %>%
mutate (efficacy_xgboost = ((e.cum_hat_xgboost / e.cum_hat_xgboost[app.mthd == "bc" & incorp == "none" & t.incorp == "-"]) - 1) * 100,
.by = c (tan.app, app.rate, man.dm, man.ph, man.source, group_temp, group_wind, group_rain))Lasso
load (file = "scripts/processed_data/data_scenarios_lasso.Rdata")
lasso_predictions_scenarios_vector = predict (
lasso_model,
data_scenarios_lasso %>% as.matrix
)
data_scenarios = data_scenarios %>%
mutate (e.cum_hat_lasso = lasso_predictions_scenarios_vector, .before = time) %>%
mutate (efficacy_lasso = ((e.cum_hat_lasso / e.cum_hat_lasso[app.mthd == "bc" & incorp == "none"]) - 1) * 100,
.by = c (tan.app, app.rate, man.dm, man.ph, man.source, group_temp, group_wind, group_rain))Alfam2
load (file = "scripts/processed_data/data_scenarios_alfam2.Rdata")
alfam2_predictions_scenarios_vector = alfam2 (
pars = alfam2pars01,
dat = data_scenarios_alfam2,
app.name = "tan.app",
time.name = "time",
time.incorp = "t.incorp",
group = "pmid",
prep = TRUE,
warn = FALSE
) %>%
filter (time == max (time), .by = pmid) %>%
select (pmid, e.cum_hat_alfam2 = e)
data_scenarios = data_scenarios %>%
left_join (alfam2_predictions_scenarios_vector, by = "pmid") %>%
mutate (efficacy_alfam2 = ((e.cum_hat_alfam2 / e.cum_hat_alfam2[app.mthd == "bc" & incorp == "none" & t.incorp == "-"]) - 1) * 100,
.by = c (tan.app, app.rate, man.dm, man.ph, man.source, group_temp, group_wind, group_rain))Comparison
plot_scenario_predictions = data_scenarios %>%
select (app.mthd, incorp, t.incorp, man.ph,
efficacy_random_forest, efficacy_xgboost, efficacy_lasso, efficacy_alfam2) %>%
rename (efficacy_rforest = efficacy_random_forest) %>%
pivot_longer (cols = c (5 : 8), names_to = c (".value", "method"), names_pattern = "(.+)_(.+)") %>%
mutate (method = recode (method, "rforest" = "random forest")) %>%
mutate (method = recode (method, "alfam2" = "A", "lasso" = "B", "random forest" = "C", "xgboost" = "D")) %>%
mutate (group = paste (app.mthd, incorp, t.incorp)) %>%
filter (! (app.mthd == "bc" & incorp == "none")) %>%
mutate (group = recode (group,
"bc shallow 0" = "incorporation",
"bsth none -" = "trailing hose",
"os none -" = "open slot",
"ts none -" = "trailing shoe")) %>%
ggplot () +
geom_boxplot (aes (x = efficacy, y = group, fill = group)) +
scale_fill_manual (values = Dark2[c(2 : 5)]) +
ylab ("") + xlab ("Efficacy") +
theme (legend.position = "none",
axis.ticks.y = element_blank(),
strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0))) +
facet_wrap (~ method, ncol = 1)
plot_scenario_predictionsdata_scenarios %>%
filter (efficacy_xgboost > 0) %>%
select (efficacy_xgboost, tan.app, app.mthd, app.rate, man.source, incorp, group_temp, group_wind, group_rain) %>%
kable ()| efficacy_xgboost | tan.app | app.mthd | app.rate | man.source | incorp | group_temp | group_wind | group_rain |
|---|---|---|---|---|---|---|---|---|
| 8.9906385 | 80.3025 | ts | 36.650 | pig | none | q1 | q1 | q1 |
| 9.0057694 | 80.3025 | ts | 36.650 | cat | none | q1 | q1 | q1 |
| 0.5613908 | 80.3025 | bsth | 18.725 | pig | none | q1 | q1 | q2 |
| 0.5621355 | 80.3025 | bsth | 18.725 | cat | none | q1 | q1 | q2 |
| 25.0445690 | 80.3025 | bsth | 36.650 | pig | none | q1 | q1 | q2 |
| 18.2393264 | 80.3025 | os | 36.650 | pig | none | q1 | q1 | q2 |
| 32.8066167 | 80.3025 | ts | 36.650 | pig | none | q1 | q1 | q2 |
| 25.0855518 | 80.3025 | bsth | 36.650 | cat | none | q1 | q1 | q2 |
| 18.2691620 | 80.3025 | os | 36.650 | cat | none | q1 | q1 | q2 |
| 32.8602812 | 80.3025 | ts | 36.650 | cat | none | q1 | q1 | q2 |
| 30.7067041 | 80.3025 | bsth | 36.650 | pig | none | q1 | q2 | q2 |
| 23.4712954 | 80.3025 | os | 36.650 | pig | none | q1 | q2 | q2 |
| 25.4853738 | 80.3025 | ts | 36.650 | pig | none | q1 | q2 | q2 |
| 30.7704933 | 80.3025 | bsth | 36.650 | cat | none | q1 | q2 | q2 |
| 23.5200540 | 80.3025 | os | 36.650 | cat | none | q1 | q2 | q2 |
| 25.5383165 | 80.3025 | ts | 36.650 | cat | none | q1 | q2 | q2 |
| 4.5797139 | 36.6595 | ts | 36.650 | pig | none | q2 | q1 | q1 |
| 4.5901176 | 36.6595 | ts | 36.650 | cat | none | q2 | q1 | q1 |
| 11.1164005 | 80.3025 | ts | 36.650 | pig | none | q2 | q1 | q1 |
| 11.4099749 | 80.3025 | ts | 36.650 | cat | none | q2 | q1 | q1 |
| 9.3674813 | 36.6595 | ts | 36.650 | pig | none | q2 | q1 | q2 |
| 9.3912529 | 36.6595 | ts | 36.650 | cat | none | q2 | q1 | q2 |
| 10.4259527 | 80.3025 | bsth | 18.725 | pig | none | q2 | q1 | q2 |
| 7.6296711 | 80.3025 | os | 18.725 | pig | none | q2 | q1 | q2 |
| 14.5180492 | 80.3025 | ts | 18.725 | pig | none | q2 | q1 | q2 |
| 10.6716109 | 80.3025 | bsth | 18.725 | cat | none | q2 | q1 | q2 |
| 7.8093910 | 80.3025 | os | 18.725 | cat | none | q2 | q1 | q2 |
| 14.8600869 | 80.3025 | ts | 18.725 | cat | none | q2 | q1 | q2 |
| 15.0486933 | 80.3025 | bsth | 36.650 | pig | none | q2 | q1 | q2 |
| 11.4423307 | 80.3025 | os | 36.650 | pig | none | q2 | q1 | q2 |
| 32.0733336 | 80.3025 | ts | 36.650 | pig | none | q2 | q1 | q2 |
| 15.4434423 | 80.3025 | bsth | 36.650 | cat | none | q2 | q1 | q2 |
| 11.7424345 | 80.3025 | os | 36.650 | cat | none | q2 | q1 | q2 |
| 32.9145756 | 80.3025 | ts | 36.650 | cat | none | q2 | q1 | q2 |
| 8.9087332 | 80.3025 | ts | 36.650 | pig | none | q2 | q2 | q1 |
| 9.0807629 | 80.3025 | ts | 36.650 | cat | none | q2 | q2 | q1 |
| 2.5264251 | 80.3025 | bsth | 18.725 | pig | none | q2 | q2 | q2 |
| 0.7573102 | 80.3025 | os | 18.725 | pig | none | q2 | q2 | q2 |
| 2.5725532 | 80.3025 | bsth | 18.725 | cat | none | q2 | q2 | q2 |
| 0.7710910 | 80.3025 | os | 18.725 | cat | none | q2 | q2 | q2 |
| 30.5988491 | 80.3025 | bsth | 36.650 | pig | none | q2 | q2 | q2 |
| 27.8218260 | 80.3025 | os | 36.650 | pig | none | q2 | q2 | q2 |
| 33.8994959 | 80.3025 | ts | 36.650 | pig | none | q2 | q2 | q2 |
| 31.3037845 | 80.3025 | bsth | 36.650 | cat | none | q2 | q2 | q2 |
| 28.4628769 | 80.3025 | os | 36.650 | cat | none | q2 | q2 | q2 |
| 34.6804699 | 80.3025 | ts | 36.650 | cat | none | q2 | q2 | q2 |
data_scenarios %>%
filter (efficacy_xgboost > 0) %>%
select (tan.app, app.mthd, app.rate, man.source, incorp, group_temp, group_wind, group_rain) %>%
lapply (table)$tan.app
36.6595 80.3025
4 42
$app.mthd
bsth os ts
14 12 20
$app.rate
18.725 36.65
12 34
$man.source
cat pig
23 23
$incorp
none
46
$group_temp
q1 q2
16 30
$group_wind
q1 q2
28 18
$group_rain
q1 q2
8 38
data_tmp = data_scenarios %>%
filter (efficacy_xgboost > 0) %>%
select (efficacy_xgboost, tan.app, app.mthd, app.rate, man.source, incorp, group_temp, group_wind, group_rain) %>%
mutate (group_rain = ifelse (group_rain == "q1", "low", "high")) %>%
mutate (tan.app = paste ("TAN = ", tan.app),
app.rate = paste ("Application rate = ", app.rate),
group_rain = paste ("Rain = ", group_rain))
plot = ggplot (data_tmp) +
geom_histogram (aes (efficacy_xgboost)) +
facet_wrap (~ tan.app + app.rate + group_rain) +
xlab ("Efficacy") +
ylab ("Number of predictions") +
theme (axis.title.y = element_text (margin = ggplot2::margin (r = 20)),
strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0)),
strip.text = element_text (size = 20))
plotdata_scenarios %>%
filter (! (app.mthd == "bc" & incorp == "none")) %>%
summarise (mean_rf = mean (efficacy_random_forest),
mean_alfam2 = mean (efficacy_alfam2),
mean_gbf = mean (efficacy_xgboost),
mean_lasso = mean (efficacy_lasso), .by = c ('app.mthd')) %>%
kable()| app.mthd | mean_rf | mean_alfam2 | mean_gbf | mean_lasso |
|---|---|---|---|---|
| bsth | -52.11648 | -34.29654 | -38.60492 | -23.72561 |
| os | -52.75687 | -61.75070 | -41.86650 | -42.49275 |
| ts | -50.90685 | -34.29654 | -34.98651 | -23.72561 |
| bc | -16.64187 | -43.07410 | -34.74081 | -21.99013 |
Figures
legend = cowplot::get_legend (plot_shap_rf)
plot_shap_rf_2 = plot_shap_rf +
annotate (geom = "label", label = "A", size = 22, fontface = "bold", x = 45, y = 1.5, fill = "#000080", color = "white") +
xlim (c (- 18, 45)) +
theme (legend.position = "none",
plot.margin = unit (c (0, 0, 1, 0), "cm"),
axis.text.x = element_blank(),
axis.title.x = element_blank(),
axis.ticks.x = element_blank())
plot_shap_xgboost_2 = plot_shap_xgboost +
annotate (geom = "label", label = "B", size = 22, fontface = "bold", x = 45, y = 1.5, fill = "#000080", color = "white") +
xlim (c (- 18, 45)) +
theme (legend.position = "none",
plot.margin = unit (c (0, 0, 1, 0), "cm"),
axis.title.x = element_text (margin = ggplot2::margin (t = 10, b = 20)))
png (file = "figures/plot_shap.png", width = 1000, height = 900)
do.call ("grid.arrange",
c (list(plot_shap_rf_2,
plot_shap_xgboost_2,
legend),
list(layout_matrix = as.matrix(c (1, 2, 3)),
heights = c (0.8, 1, 0.2))))
dev.off()png
2
png (file = "figures/plot_importance.png", width = 1300, height = 800)
grid.arrange (plot_importance_rf, plot_importance_xgboost, nrow = 1)
dev.off()png
2
Supplementary material
data = read_excel("data/Peng_Xu_et_al_2024/Supplementary_Table_3_use2775.xlsx", sheet = 2)head (data) %>% select (- Title) %>% kable()| No. | Author | Language | Year | Replicates | Fertilizer type | Nitrogen placement | Fertilizer application time | Soil tillage practices | Crop type | Water input | Tem | SOC | TN | pH | BD | Clay | CEC | Nrate | Erate |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Lin et al. (2015) | Chinese | 2013 | 3 | U | SBC | 1 | CT | Rice | 1289 | 26.0 | 5.4 | 0.6 | 7.9 | 1.3 | 20.0 | 18.0 | 113 | 22.1 |
| 1 | Lin et al. (2015) | Chinese | 2013 | 3 | U | SBC | 1 | CT | Rice | 1289 | 26.0 | 5.4 | 0.6 | 7.9 | 1.3 | 20.0 | 18.0 | 150 | 32.7 |
| 1 | Lin et al. (2015) | Chinese | 2013 | 3 | U | SBC | 1 | CT | Rice | 1289 | 26.0 | 5.4 | 0.6 | 7.9 | 1.3 | 20.0 | 18.0 | 188 | 35.9 |
| 1 | Lin et al. (2015) | Chinese | 2013 | 3 | U | SBC | 1 | CT | Rice | 1289 | 26.0 | 5.4 | 0.6 | 7.9 | 1.3 | 20.0 | 18.0 | 225 | 44.6 |
| 1 | Lin et al. (2015) | Chinese | 2013 | 3 | U | SBC | 1 | CT | Rice | 1289 | 26.0 | 5.4 | 0.6 | 7.9 | 1.3 | 20.0 | 18.0 | 300 | 63.1 |
| 2 | Zhu et al. (2013) | Chinese | 2012 | 3 | Others | Mix | 2 | CT | Rice | 1500 | 25.1 | 21.9 | 1.9 | 5.8 | 1.1 | 38.3 | 11.4 | 113 | 38.3 |
plot_temperature_comparison = rbind (
data %>% select (temp = Tem) %>% mutate (data = "Data from Peng Xu et al (2024)"),
data_alfam2 %>% select (temp = air.temp) %>% mutate (data = "Data used for the alfam2 model")
) %>%
ggplot () +
geom_histogram (aes (x = temp)) +
facet_wrap (~ data, ncol = 1, scales = "free") +
theme (strip.text.x = element_text(margin = ggplot2::margin(t = 8, b = 8, r = 0, l = 0)))+
xlab ("Air temperature (°C)") +
ylab ("Number of observations")
plot_temperature_comparisonrbind (
data %>% select (temp = Tem) %>% mutate (data = "Data from Peng Xu et al (2024)"),
data_alfam2 %>% select (temp = air.temp) %>% mutate (data = "Data used for the alfam2 model")
) %>%
summarise (mean_temp = mean (temp), sd_temp = sd (temp), .by = data)# A tibble: 2 × 3
data mean_temp sd_temp
<chr> <dbl> <dbl>
1 Data from Peng Xu et al (2024) 20.7 6.86
2 Data used for the alfam2 model 13.3 5.48